Q1. 지역에 따라서 선호하는 게임 장르가 다를까 라는 질문에 대답
Q2. 연도별 게임의 트렌드가 있을까 라는 질문에 대답
Q3. 출고량이 높은 게임에 대한 분석 및 시각화 프로세스가 포함
=>다음 분기에 어떤 게임을 설계해야 할까
Q1. 지역에 따라서 선호하는 게임 장르가 다를까 라는 질문에 대답
Q2. 연도별 게임의 트렌드가 있을까 라는 질문에 대답
Q3. 출고량이 높은 게임에 대한 분석 및 시각화 프로세스가 포함
=>다음 분기에 어떤 게임을 설계해야 할까
Name : 게임의 이름입니다.
Platform : 게임이 지원되는 플랫폼의 이름입니다.
Year : 게임이 출시된 연도입니다.
Genre : 게임의 장르입니다.
Publisher : 게임을 배급(퍼블리싱)한 회사입니다.
NA_Sales : 북미지역에서의 출고량입니다.
EU_Sales : 유럽지역에서의 출고량입니다.
JP_Sales : 일본지역에서의 출고량입니다.
Other_Sales : 기타지역에서의 출고량입니다.
# Load the sales table (the first CSV column becomes the index) and discard
# the Publisher column, which this analysis does not use.
df = (
    pd.read_csv('vgames.csv', index_col=0)
    .drop(columns=['Publisher'])
)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 16598 entries, 1 to 16598 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 16598 non-null object 1 Platform 16598 non-null object 2 Year 16597 non-null float64 3 Genre 16597 non-null object 4 NA_Sales 16598 non-null float64 5 EU_Sales 16598 non-null float64 6 JP_Sales 16598 non-null float64 7 Other_Sales 16598 non-null float64 dtypes: float64(5), object(3) memory usage: 1.1+ MB
df.isna().sum()  # count the missing values in each column
Name 0 Platform 0 Year 1 Genre 1 NA_Sales 0 EU_Sales 0 JP_Sales 0 Other_Sales 0 dtype: int64
# Look at the rows that are missing Year or Genre before discarding them.
df[df['Year'].isnull() | df['Genre'].isnull()].head()
# Author's note: two of the missing values could not be recovered, so the
# incomplete rows are dropped.
# The cast to int is chained onto dropna() so that no column is assigned on
# an intermediate result; the attribute assignment `df.Year = ...` is what
# raised SettingWithCopyWarning.
df = df.dropna().astype({'Year': 'int'})
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py:5516: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Years of 1900 or earlier are treated as outliers and removed.
df = df[df['Year'] > 1900].reset_index(drop=True)
# Show the index labels of fully duplicated rows.
df[df.duplicated()].index
# Drop exactly the rows duplicated() flags instead of a hard-coded label
# (9180), which is only valid for one specific file and row order.
df = df.drop_duplicates().reset_index(drop=True)
df.duplicated().sum()  # should now be 0
0
df.isnull().sum()  # final check of the missing-value count per column
Name 0 Platform 0 Year 0 Genre 0 NA_Sales 0 EU_Sales 0 JP_Sales 0 Other_Sales 0 dtype: int64
df.nunique()  # number of unique values per column => shows how many genres exist
Name 11489 Platform 31 Year 44 Genre 13 NA_Sales 409 EU_Sales 305 JP_Sales 244 Other_Sales 157 dtype: int64
# Make sure every regional sales column is float so they can be added up
# later. (The author's note says these were object dtype; the df.info()
# output recorded earlier in this file already reports float64.)
for sales_col in ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'):
    df[sales_col] = df[sales_col].astype('float')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 16589 entries, 0 to 16588 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 16589 non-null object 1 Platform 16589 non-null object 2 Year 16589 non-null int64 3 Genre 16589 non-null object 4 NA_Sales 16589 non-null float64 5 EU_Sales 16589 non-null float64 6 JP_Sales 16589 non-null float64 7 Other_Sales 16589 non-null float64 dtypes: float64(4), int64(1), object(3) memory usage: 1.0+ MB
# Total sales per game = sum of the four regional columns.
# The columns are named explicitly (instead of the positional slice
# iloc[i, 4:]) so that re-running this cell does not add an existing
# 'Total' column into the sum, and the row-wise sum is vectorised instead of
# looping over every row in Python.
df['Total'] = df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum(axis=1)
# Whole dataset: share of games per genre.
genre_counts = df.Genre.value_counts()  # computed once, sorted by frequency
labels = genre_counts.index
sizes = genre_counts.values
# Pull out the two largest slices; the list is sized from the data so the
# chart keeps working if the number of genres changes.
explode = ([0.3, 0.1] + [0] * len(sizes))[:len(sizes)]
# visual
plt.figure(figsize=(7, 7))
plt.pie(sizes, explode=explode, labels=labels, colors=sns.color_palette('Set2'), autopct='%1.1f%%')
plt.title('Games According to Genre', fontsize=17, color='black')  # genre share over the whole dataset
plt.show()
platform1 = df['Platform'].unique()
platform1
# Recorded output of the expression above:
# array(['DS', 'Wii', 'PSP', 'PS3', 'PC', 'PS', 'GBA', 'PS4', 'PS2', 'XB',
#        'X360', 'GC', '3DS', '2600', 'SAT', 'GB', 'NES', 'DC', 'N64',
#        'XOne', 'SNES', 'WiiU', 'PSV', 'GEN', 'SCD', 'WS', 'NG', 'TG16',
#        '3DO', 'GG', 'PCFX'], dtype=object)

# Share of games per platform.
platform_counts = df.Platform.value_counts()  # computed once, sorted by frequency
labels = platform_counts.index
sizes = platform_counts.values
# Pull out the two largest slices; sized from the data rather than a
# hand-written list of 31 entries.
explode = ([0.3, 0.1] + [0] * len(sizes))[:len(sizes)]
# visual
plt.figure(figsize=(12, 15))
plt.pie(sizes, explode=explode, labels=labels, colors=sns.color_palette('Set2'), autopct='%1.1f%%')
plt.title('Games According to Platform', fontsize=17, color='black')  # platform share over the whole dataset
plt.show()
# Number of games per genre.
# The column is passed by keyword: newer seaborn releases no longer accept
# the data vector as a bare positional argument.
plt.figure()  # separate figure so the two count plots do not overlap
sns.countplot(x='Genre', data=df)
plt.xticks(rotation=90)
plt.title("Genre", color="blue", fontsize=15)
plt.show()

# Number of games per platform.
plt.figure()
sns.countplot(x='Platform', data=df)
plt.xticks(rotation=90)
plt.title("Platforms", color="blue", fontsize=30)
plt.show()
# game = df.loc[df['Name']!='Wii Sports',['Name','NA_Sales']]
# game = game.sort_values('NA_Sales', ascending=False)
# game = game.head()
# fig = px.pie(game, names='Name', values='NA_Sales', template='seaborn')
# fig.update_traces(rotation=90, pull=0.06, textinfo="percent+label")
# fig.show()
# game1 = df.loc[df['Name']!='Wii Sports',['Name','EU_Sales']]
# game1= game1.sort_values('EU_Sales', ascending=False)
# game1.head()
# fig = px.pie(game1, names='Name', values='EU_Sales', template='seaborn')
# fig.update_traces(rotation=90, pull=0.06, textinfo="percent+label")
# fig.show()
# game2 = df.loc[:,['Name','JP_Sales']]
# game2 = game2.sort_values('JP_Sales', ascending=False)
# game2.head()
# fig = px.pie(game2, names='Name', values='JP_Sales', template='seaborn')
# fig.update_traces(rotation=90, pull=0.06, textinfo="percent+label")
# fig.show()
# game3 = df.loc[:,['Name','Other_Sales']]
# game3= game3.sort_values('Other_Sales', ascending=False)
# game3.head()
# fig = px.pie(game3, names='Name', values='Other_Sales', template='seaborn')
# fig.update_traces(rotation=90, pull=0.06, textinfo="percent+label")
# fig.show()
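# The cells above tried to show per-region game sales as pie charts, but they kept hitting the runtime limit, so they are commented out. (For North America and Europe, Wii Sports was so dominant that it was excluded.)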
시각화
# Distinct genres in the data, plus an alphabetically sorted list of them.
genre = pd.unique(df['Genre'])
genre_s = list(genre)
genre_s.sort()
genre_s
['Action', 'Adventure', 'Fighting', 'Misc', 'Party', 'Platform', 'Puzzle', 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports', 'Strategy']
# Total sales per genre for each region, in the same order as genre_s.
region_columns = ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales')
totals_by_region = {
    column: [df.loc[df.Genre == genre_name, column].sum() for genre_name in genre_s]
    for column in region_columns
}
na_sales = totals_by_region['NA_Sales']
eu_sales = totals_by_region['EU_Sales']
jp_sales = totals_by_region['JP_Sales']
other_sales = totals_by_region['Other_Sales']

# One horizontal bar trace per region, stacked per genre.
region_traces = (
    ('North America Sales', na_sales, 'skyblue'),
    ('Europe Sales', eu_sales, 'cornsilk'),
    ('Japan Sales', jp_sales, 'burlywood'),
    ('Other Region Sales', other_sales, 'hotpink'),
)
fig = go.Figure()
for trace_name, genre_totals, bar_color in region_traces:
    fig.add_trace(
        go.Bar(
            x=genre_totals,
            y=genre_s,
            name=trace_name,
            marker_color=bar_color,
            orientation='h',
        )
    )
fig.update_layout(
    title_text='지역별 판매량이 높은 장르',
    xaxis_title="판매량",
    yaxis_title="장르",
    barmode='stack',
)
fig.show()  # visualise genre sales per region
모든 지역에서 Action 장르가 1순위
2순위는 Sports
연도별 게임의 트렌드가 있을까
# Total sales per platform, stacked by genre.
# (genre value in the data, legend label, bar colour) in stacking order.
genre_trace_styles = [
    ("Action", "Action", "rgb(119,172,238)"),
    ("Sports", "Sports", 'rgb(21,90,174)'),
    ("Racing", "Racing", "rgb(156,245,163)"),
    ("Shooter", "Shooter", "rgb(14,135,23)"),
    ("Misc", "Misc", 'rgb(252,118,103)'),
    ("Role-Playing", "Role Playing", "rgb(226,28,5)"),
    ("Fighting", "Fighting", "rgb(247,173,13)"),
    ("Platform", "Platform", "rgb(242,122,13)"),
    ("Simulation", "Simulation", "rgb(188,145,202)"),
    ("Adventure", "Adventure", 'rgb(104,57,119)'),
    ("Strategy", "Strategy", 'rgb(245,253,104)'),
    ("Puzzle", "Puzzle", 'rgb(138,72,40)'),
]
# Genres present in the data but missing from the table above are still
# plotted (with plotly's default colour); otherwise their sales would be
# silently left out of the stacked totals.
styled_genres = {genre_key for genre_key, _, _ in genre_trace_styles}
unstyled_genres = sorted(set(df['Genre'].unique()) - styled_genres)
genre_trace_styles += [(genre_key, genre_key, None) for genre_key in unstyled_genres]

data = []
for genre_key, legend_label, bar_color in genre_trace_styles:
    # Aggregate once per genre and reuse the result for both axes.
    platform_totals = df[df.Genre == genre_key].groupby("Platform")["Total"].sum()
    data.append(
        go.Bar(
            x=platform_totals.index,
            y=platform_totals.values,
            opacity=0.75,
            name=legend_label,
            marker=dict(color=bar_color),
        )
    )

layout = go.Layout(
    barmode='stack',
    title='Total According to Platform and Genre',
    xaxis=dict(title='Platform'),
    yaxis=dict(title='Total(In Millions)'),
    paper_bgcolor='white',
    plot_bgcolor='white',
)
fig = go.Figure(data=data, layout=layout)
fig.show()  # total sales per platform, broken down by genre
# Total = [sum for i in]
# x = df.groupby(['Year']).count()
# x = x['Total']
# y = x.index.astype(int)
# plt.figure(figsize=(20,14))
# colors = sns.color_palette("muted")
# ax = sns.barplot(y = y, x = x, orient='h', palette=colors)
# ax.set_xlabel(xlabel='Total_sales', fontsize=19)
# ax.set_ylabel(ylabel='Year', fontsize=22)
# ax.set_title(label='Total Sales per Year', fontsize=22)
# plt.show(); #연도별 전체게임의 판매량
# Total sales per release year.
# Only the 'Total' column is aggregated; summing the whole frame would also
# try to "sum" the text columns (Name, Platform, Genre).
y = df.groupby('Year')['Total'].sum()
x = y.index.astype(int)
plt.figure(figsize=(15, 7))
zz = sns.barplot(y=y, x=x)
zz.set_xlabel(xlabel='Year', fontsize=17)
zz.set_xticklabels(labels=x, fontsize=12, rotation=45)
zz.set_ylabel(ylabel='Total_Sales', fontsize=17)
zz.set_title(label='Total Sales per Year', fontsize=22)
plt.show()  # total sales by year
# Most frequently released genre for each year.
# NOTE(review): `top_genre_count` and `gerne` were not defined anywhere in the
# visible file, so this plot raised NameError. The table is rebuilt here the
# same way the per-year top platform table is built — confirm this matches
# the cell that was lost.
top_genre_df = df.groupby(by=['Year', 'Genre']).size().reset_index(name='Count')
is_yearly_max = top_genre_df.groupby(by=['Year'])['Count'].transform('max') == top_genre_df['Count']
top_genre_count = top_genre_df[is_yearly_max].reset_index(drop=True)
# When several genres tie for a year, keep only the last one.
top_genre_count = top_genre_count.drop_duplicates(subset=["Year", "Count"], keep='last').reset_index(drop=True)

plt.figure(figsize=(25, 10))
a = sns.barplot(x='Year', y='Count', data=top_genre_count)
# Label each bar with "<genre>----<count>" just above its top.
for bar_position, (genre_name, value) in enumerate(
        zip(top_genre_count['Genre'], top_genre_count['Count'])):
    a.text(bar_position, value + 5, f'{genre_name}----{value}',
           color='#000', size=14, rotation=90, ha="center")
plt.xticks(rotation=0)
plt.show()  # visualisation
# Platform with the most releases for each year.
top_platform = df[['Year', 'Platform']]
top_platform_df = top_platform.groupby(by=['Year', 'Platform']).size().reset_index(name='Count')
# 'max' is passed by name; handing the builtin `max` to transform() is
# deprecated in recent pandas.
top_platform_idx = top_platform_df.groupby(by=['Year'])['Count'].transform('max') == top_platform_df['Count']
top_platform_count = top_platform_df[top_platform_idx].reset_index(drop=True)
# When several platforms tie for a year, keep only the last one.
top_platform_count = top_platform_count.drop_duplicates(subset=["Year", "Count"], keep='last').reset_index(drop=True)
platform = top_platform_count['Platform']

plt.figure(figsize=(25, 10))
a = sns.barplot(x='Year', y='Count', data=top_platform_count)
# Label each bar with "<platform>----<count>" just above its top.
for bar_position, (platform_name, value) in enumerate(
        zip(platform, top_platform_count['Count'])):
    a.text(bar_position, value + 5, f'{platform_name}----{value}',
           color='#000', size=14, rotation=90, ha="center")
plt.xticks(rotation=0)
plt.show()
1990~2002년까지는 Sports 장르가 주류였으며, 그 이후부터는 Action 장르가 주류를 이룸
출고량이 높은 게임에 대한 분석 및 시각화 프로세스가 포함
# Detail rows for the five best-selling titles, ordered by name.
top_titles = [
    'Grand Theft Auto V',
    'Wii Sports',
    'Super Mario Bros.',
    'Tetris',
    'Mario Kart Wii',
]
zz = df.loc[df['Name'].isin(top_titles)].sort_values(by='Name')
zz  # every record belonging to the top five games
def plot_platform_yearly_sales(platform_name):
    """Draw a bar chart of total sales per year for one platform.

    Args:
        platform_name: Value of the 'Platform' column to filter on.

    Returns:
        The matplotlib Axes returned by seaborn's barplot.
    """
    platform_rows = df[df['Platform'] == platform_name]
    # Aggregate only 'Total'; groupby already returns the years in sorted
    # order, so no explicit sort is needed beforehand.
    yearly_total = platform_rows.groupby('Year')['Total'].sum()
    years = yearly_total.index.astype(int)
    plt.figure(figsize=(15, 7))
    ax = sns.barplot(y=yearly_total, x=years)
    ax.set_xlabel(xlabel='Year', fontsize=17)
    ax.set_xticklabels(labels=years, fontsize=12, rotation=45)
    ax.set_ylabel(ylabel='Total_Sales', fontsize=17)
    ax.set_title(label=f'Total Sales per Year({platform_name})', fontsize=22)
    plt.show()
    return ax


# Same chart for each platform of interest (previously six copy-pasted cells).
for platform_name in ('DS', 'PS', 'PS2', 'PS3', '3DS', 'PS4'):
    plot_platform_yearly_sales(platform_name)